EDA¶

  • Jose Abal Caamaño
  • Jesús Platero Acevedo

Objective of the case¶

To realize an exploratory data analysis of dataset ticdata.

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
import requests
import pandas_profiling
from pandas_profiling.utils.cache import cache_file
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
In [2]:
#Read the datasets and create our data frames
df_ticdata = pd.read_csv("/Users/joseabalcaamano/Desktop/MachineLearning/PracticaEDA/data/raw/ticdata2000.txt", sep = "\t", header = None)
df_ticeval = pd.read_csv("/Users/joseabalcaamano/Desktop/MachineLearning/PracticaEDA/data/raw/ticeval2000.txt", sep = "\t", header = None)
df_tictgts = pd.read_csv("/Users/joseabalcaamano/Desktop/MachineLearning/PracticaEDA/data/raw/tictgts2000.txt", sep = "\t", header = None)
headers = list(range(1,87))
df_ticdata.columns = headers

headers = list(range(1,87))
df_ticdata.columns = headers

#Add a new column differentiating the data in train and test. For us, our train is ticdata.
df_ticdata['87'] = 1
df_tictgts['87'] = 0
In [3]:
#To know the rows and columns of our dataset.
df_ticdata.shape
Out[3]:
(5822, 87)
In [4]:
#Check if they are the same rows.
print(len(df_ticdata), 
      len(df_ticeval), 
      len(df_tictgts))
5822 4000 4000
In [5]:
#There are 86 variables in our dataset ticdata, divided in categoricals and numericals, and the number 86, CARAVAN,
#is our target variable 
list_table = ["MOSTYPE", "MAANTHUI", "MGEMOMV",
                        "MGEMLEEF", "MOSHOOFD", "MGODRK",
                        "MGODPR", "MGODOV", "MGODGE",
                        "MRELGE", "MRELSA", "MRELOV", "MFALLEEN",
                        "MFGEKIND", "MFWEKIND", "MOPLHOOG", "MOPLMIDD",
                        "MOPLLAAG", "MBERHOOG", "MBERZELF",
                        "MBERBOER", "MBERMIDD", "MBERARBG", "MBERARBO",
                        "MSKA","MSKB1", "MSKB2", "MSKC",
                        "MSKD", "MHHUUR", "MHKOOP", "MAUT1", "MAUT2",
                        "MAUT0", "MZFONDS", "MZPART", "MINKM30",
                        "MINK3045", "MINK4575", "MINK7512", "MINK123M",
                        "MINKGEM", "MKOOPKLA", "PWAPART", "PWABEDR", "PWALAND",
                        "PPERSAUT", "PBESAUT", "PMOTSCO", "PVRAAUT",
                        "PAANHANG", "PTRACTOR", "PWERKT", "PBROM",
                        "PLEVEN", "PPERSONG", "PGEZONG", "PWAOREG",
                        "PBRAND", "PZEILPL", "PPLEZIER", "PFIETS","PINBOED", "PBYSTAND","AWAPART","AWABEDR",
                        "AWALAND","APERSAUT","ABESAUT","AMOTSCO","AVRAAUT","AAANHANG","ATRACTOR",
                         "AWERKT","ABROM","ALEVEN","APERSONG","AGEZONG","AWAOREG","ABRAND","AZEILPL","APLEZIER",
                         "AFIETS", "AINBOED", "ABYSTAND", "CARAVAN", "TRAIN"]
list_other_var = ['title']
list_tictgts = ["CARAVAN"]
len(list_table)
Out[5]:
87
In [6]:
list_tictgts = ["CARAVAN", "TRAIN"]
list_var = ['title']
len(list_tictgts)
Out[6]:
2
In [7]:
df_ticdata.columns= list_table
df_tictgts.columns = list_tictgts
df_ticeval.columns = list_table[:-2]
df_ticdata
Out[7]:
MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575 MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND CARAVAN TRAIN
0 33 1 3 2 8 0 5 1 3 7 0 2 1 2 6 1 2 7 1 0 1 2 5 2 1 1 2 6 1 1 8 8 0 1 8 1 0 4 5 0 0 4 3 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
1 37 1 2 2 8 1 4 1 4 6 2 2 0 4 5 0 5 4 0 0 0 5 0 4 0 2 3 5 0 2 7 7 1 2 6 3 2 0 5 2 0 5 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
2 37 1 2 2 8 0 4 2 4 3 2 4 4 4 2 0 5 4 0 0 0 7 0 2 0 5 0 4 0 7 2 7 0 2 9 0 4 5 0 0 0 3 4 2 0 0 6 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
3 9 1 3 3 3 2 3 2 4 5 2 2 2 3 4 3 4 2 4 0 0 3 1 2 3 2 1 4 0 5 4 9 0 0 7 2 1 5 3 0 0 4 4 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
4 40 1 4 2 10 1 4 1 4 7 1 2 2 4 4 5 4 0 0 5 4 0 0 0 9 0 0 0 0 4 5 6 2 1 5 4 0 0 9 0 0 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5817 36 1 1 2 8 0 6 1 2 1 2 6 5 3 2 2 5 2 2 0 0 4 1 3 2 3 3 3 0 9 0 5 1 3 5 4 4 3 3 0 0 3 3 2 0 0 6 0 4 0 0 0 0 0 3 0 0 0 3 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1
5818 35 1 4 4 8 1 4 1 4 6 0 3 2 2 5 0 0 9 2 1 1 3 3 2 0 4 5 0 0 3 6 6 1 2 6 3 0 9 0 0 0 4 5 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1
5819 33 1 3 4 8 0 6 0 3 5 1 4 3 3 4 0 1 8 1 0 0 2 3 5 1 1 1 4 4 7 2 4 0 5 8 1 5 3 1 1 0 3 3 2 0 0 6 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
5820 34 1 3 2 8 0 7 0 2 7 2 0 0 4 5 0 2 7 0 2 0 2 4 2 0 0 4 5 0 2 7 5 4 0 9 0 0 5 4 0 0 4 6 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
5821 33 1 3 3 8 0 6 1 2 7 1 2 1 4 4 1 2 6 1 0 1 3 2 4 1 1 2 6 1 5 4 5 2 3 6 3 2 5 2 1 0 3 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1

5822 rows × 87 columns

In [8]:
print(len(df_ticdata.columns), 
      len(df_ticeval.columns), 
      len(df_tictgts.columns))
87 85 2
In [9]:
#add the column CARAVAN in the other dataset ticeval
df_ticeval = pd.concat([df_ticeval, df_tictgts], axis = 1, join ="inner")
In [10]:
#Reset our index
df_ticdata = df_ticdata.reset_index(col_fill='').drop(['index'], axis=1)
df_ticdata
Out[10]:
MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575 MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND CARAVAN TRAIN
0 33 1 3 2 8 0 5 1 3 7 0 2 1 2 6 1 2 7 1 0 1 2 5 2 1 1 2 6 1 1 8 8 0 1 8 1 0 4 5 0 0 4 3 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
1 37 1 2 2 8 1 4 1 4 6 2 2 0 4 5 0 5 4 0 0 0 5 0 4 0 2 3 5 0 2 7 7 1 2 6 3 2 0 5 2 0 5 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
2 37 1 2 2 8 0 4 2 4 3 2 4 4 4 2 0 5 4 0 0 0 7 0 2 0 5 0 4 0 7 2 7 0 2 9 0 4 5 0 0 0 3 4 2 0 0 6 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
3 9 1 3 3 3 2 3 2 4 5 2 2 2 3 4 3 4 2 4 0 0 3 1 2 3 2 1 4 0 5 4 9 0 0 7 2 1 5 3 0 0 4 4 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
4 40 1 4 2 10 1 4 1 4 7 1 2 2 4 4 5 4 0 0 5 4 0 0 0 9 0 0 0 0 4 5 6 2 1 5 4 0 0 9 0 0 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5817 36 1 1 2 8 0 6 1 2 1 2 6 5 3 2 2 5 2 2 0 0 4 1 3 2 3 3 3 0 9 0 5 1 3 5 4 4 3 3 0 0 3 3 2 0 0 6 0 4 0 0 0 0 0 3 0 0 0 3 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1
5818 35 1 4 4 8 1 4 1 4 6 0 3 2 2 5 0 0 9 2 1 1 3 3 2 0 4 5 0 0 3 6 6 1 2 6 3 0 9 0 0 0 4 5 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1
5819 33 1 3 4 8 0 6 0 3 5 1 4 3 3 4 0 1 8 1 0 0 2 3 5 1 1 1 4 4 7 2 4 0 5 8 1 5 3 1 1 0 3 3 2 0 0 6 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
5820 34 1 3 2 8 0 7 0 2 7 2 0 0 4 5 0 2 7 0 2 0 2 4 2 0 0 4 5 0 2 7 5 4 0 9 0 0 5 4 0 0 4 6 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
5821 33 1 3 3 8 0 6 1 2 7 1 2 1 4 4 1 2 6 1 0 1 3 2 4 1 1 2 6 1 5 4 5 2 3 6 3 2 5 2 1 0 3 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1

5822 rows × 87 columns

In [11]:
#Observ the duplicates of the dataset ticdata
print(df_ticdata.shape, df_ticdata.drop_duplicates().shape)
(5822, 87) (5220, 87)
In [12]:
#Missing values
pd.isnull(df_ticdata[[x for x in df_ticdata.columns[pd.isna(df_ticdata).any()].tolist()]]).sum()
Out[12]:
Series([], dtype: float64)
In [13]:
#Look the type of all variables
df_ticdata.dtypes.to_dict()
Out[13]:
{'MOSTYPE': dtype('int64'),
 'MAANTHUI': dtype('int64'),
 'MGEMOMV': dtype('int64'),
 'MGEMLEEF': dtype('int64'),
 'MOSHOOFD': dtype('int64'),
 'MGODRK': dtype('int64'),
 'MGODPR': dtype('int64'),
 'MGODOV': dtype('int64'),
 'MGODGE': dtype('int64'),
 'MRELGE': dtype('int64'),
 'MRELSA': dtype('int64'),
 'MRELOV': dtype('int64'),
 'MFALLEEN': dtype('int64'),
 'MFGEKIND': dtype('int64'),
 'MFWEKIND': dtype('int64'),
 'MOPLHOOG': dtype('int64'),
 'MOPLMIDD': dtype('int64'),
 'MOPLLAAG': dtype('int64'),
 'MBERHOOG': dtype('int64'),
 'MBERZELF': dtype('int64'),
 'MBERBOER': dtype('int64'),
 'MBERMIDD': dtype('int64'),
 'MBERARBG': dtype('int64'),
 'MBERARBO': dtype('int64'),
 'MSKA': dtype('int64'),
 'MSKB1': dtype('int64'),
 'MSKB2': dtype('int64'),
 'MSKC': dtype('int64'),
 'MSKD': dtype('int64'),
 'MHHUUR': dtype('int64'),
 'MHKOOP': dtype('int64'),
 'MAUT1': dtype('int64'),
 'MAUT2': dtype('int64'),
 'MAUT0': dtype('int64'),
 'MZFONDS': dtype('int64'),
 'MZPART': dtype('int64'),
 'MINKM30': dtype('int64'),
 'MINK3045': dtype('int64'),
 'MINK4575': dtype('int64'),
 'MINK7512': dtype('int64'),
 'MINK123M': dtype('int64'),
 'MINKGEM': dtype('int64'),
 'MKOOPKLA': dtype('int64'),
 'PWAPART': dtype('int64'),
 'PWABEDR': dtype('int64'),
 'PWALAND': dtype('int64'),
 'PPERSAUT': dtype('int64'),
 'PBESAUT': dtype('int64'),
 'PMOTSCO': dtype('int64'),
 'PVRAAUT': dtype('int64'),
 'PAANHANG': dtype('int64'),
 'PTRACTOR': dtype('int64'),
 'PWERKT': dtype('int64'),
 'PBROM': dtype('int64'),
 'PLEVEN': dtype('int64'),
 'PPERSONG': dtype('int64'),
 'PGEZONG': dtype('int64'),
 'PWAOREG': dtype('int64'),
 'PBRAND': dtype('int64'),
 'PZEILPL': dtype('int64'),
 'PPLEZIER': dtype('int64'),
 'PFIETS': dtype('int64'),
 'PINBOED': dtype('int64'),
 'PBYSTAND': dtype('int64'),
 'AWAPART': dtype('int64'),
 'AWABEDR': dtype('int64'),
 'AWALAND': dtype('int64'),
 'APERSAUT': dtype('int64'),
 'ABESAUT': dtype('int64'),
 'AMOTSCO': dtype('int64'),
 'AVRAAUT': dtype('int64'),
 'AAANHANG': dtype('int64'),
 'ATRACTOR': dtype('int64'),
 'AWERKT': dtype('int64'),
 'ABROM': dtype('int64'),
 'ALEVEN': dtype('int64'),
 'APERSONG': dtype('int64'),
 'AGEZONG': dtype('int64'),
 'AWAOREG': dtype('int64'),
 'ABRAND': dtype('int64'),
 'AZEILPL': dtype('int64'),
 'APLEZIER': dtype('int64'),
 'AFIETS': dtype('int64'),
 'AINBOED': dtype('int64'),
 'ABYSTAND': dtype('int64'),
 'CARAVAN': dtype('int64'),
 'TRAIN': dtype('int64')}
In [14]:
#We took the target variable, 'CARAVAN', potencial clients who search an insurance of caravans. 
#We look how many people don't have insurance and how many do.
df_data_caravan = df_ticdata['CARAVAN']\
        .value_counts(normalize=True)\
        .mul(100).rename('percent').reset_index()

df_data_caravan_conteo = df_ticdata['CARAVAN'].value_counts().reset_index()
df_data_caravan_pc = pd.merge(df_data_caravan, df_data_caravan_conteo, on=['index'], how='inner')
df_data_caravan_pc
Out[14]:
index percent CARAVAN
0 0 94.022673 5474
1 1 5.977327 348
In [15]:
#We will create a histogram on the percent of people who have or no insurance of caravans.
fig = px.histogram(df_data_caravan_pc, x="index", y=['percent'])
fig.show()

We can look that the 94.03% don´t have an insurance of caravans, versus a 5,97% that have it.

In [16]:
#Removed the people who have an insurance and we will continue work with that they don´t have it. 
df_ticdata_caravan = df_ticdata[(df_ticdata['CARAVAN']== 0)]
df_ticdata_caravan.shape
Out[16]:
(5474, 87)
In [17]:
#Comprobate if in our dataframe exists nulls values
df_data_null_columns = df_ticdata.isnull().sum().sort_values(ascending=False)
df_data_null_rows = df_ticdata.isnull().sum(axis=1).sort_values(ascending=False)
print(df_data_null_columns.shape, df_data_null_rows.shape)

df_null_columnas = pd.DataFrame(df_data_null_columns, columns=['nulls_columns'])     
df_null_filas = pd.DataFrame(df_data_null_rows, columns=['nulls_rows'])  
df_null_columnas['percent_columns'] = df_null_columnas['nulls_columns']/df_ticdata.shape[0]
df_null_filas['percent_rows']= df_null_filas['nulls_rows']/df_ticdata.shape[1]
(87,) (5822,)
In [18]:
#No exists null values. 
df_null_columnas
Out[18]:
nulls_columns percent_columns
MOSTYPE 0 0.0
PPERSONG 0 0.0
PBYSTAND 0 0.0
PINBOED 0 0.0
PFIETS 0 0.0
PPLEZIER 0 0.0
PZEILPL 0 0.0
PBRAND 0 0.0
PWAOREG 0 0.0
PGEZONG 0 0.0
PLEVEN 0 0.0
AWABEDR 0 0.0
PBROM 0 0.0
PWERKT 0 0.0
PTRACTOR 0 0.0
PAANHANG 0 0.0
PVRAAUT 0 0.0
PMOTSCO 0 0.0
PBESAUT 0 0.0
PPERSAUT 0 0.0
AWAPART 0 0.0
AWALAND 0 0.0
PWABEDR 0 0.0
AGEZONG 0 0.0
CARAVAN 0 0.0
ABYSTAND 0 0.0
AINBOED 0 0.0
AFIETS 0 0.0
APLEZIER 0 0.0
AZEILPL 0 0.0
ABRAND 0 0.0
AWAOREG 0 0.0
APERSONG 0 0.0
APERSAUT 0 0.0
ALEVEN 0 0.0
ABROM 0 0.0
AWERKT 0 0.0
ATRACTOR 0 0.0
AAANHANG 0 0.0
AVRAAUT 0 0.0
AMOTSCO 0 0.0
ABESAUT 0 0.0
PWALAND 0 0.0
PWAPART 0 0.0
MAANTHUI 0 0.0
MRELOV 0 0.0
MBERZELF 0 0.0
MBERHOOG 0 0.0
MOPLLAAG 0 0.0
MOPLMIDD 0 0.0
MOPLHOOG 0 0.0
MFWEKIND 0 0.0
MFGEKIND 0 0.0
MFALLEEN 0 0.0
MRELSA 0 0.0
MBERMIDD 0 0.0
MRELGE 0 0.0
MGODGE 0 0.0
MGODOV 0 0.0
MGODPR 0 0.0
MGODRK 0 0.0
MOSHOOFD 0 0.0
MGEMLEEF 0 0.0
MGEMOMV 0 0.0
MBERBOER 0 0.0
MBERARBG 0 0.0
MKOOPKLA 0 0.0
MAUT0 0 0.0
MINKGEM 0 0.0
MINK123M 0 0.0
MINK7512 0 0.0
MINK4575 0 0.0
MINK3045 0 0.0
MINKM30 0 0.0
MZPART 0 0.0
MZFONDS 0 0.0
MAUT2 0 0.0
MBERARBO 0 0.0
MAUT1 0 0.0
MHKOOP 0 0.0
MHHUUR 0 0.0
MSKD 0 0.0
MSKC 0 0.0
MSKB2 0 0.0
MSKB1 0 0.0
MSKA 0 0.0
TRAIN 0 0.0

Transform categorical variables into numerical¶

In [19]:
#The column 'MGEMLEEF' is a categorical variable who represents the age, we convert it into a numerical variable
#set the mean value of the values.

df_ticdata.loc[df_ticdata['MGEMLEEF'] == 1,'MGEMLEEF']=25
df_ticdata.loc[df_ticdata['MGEMLEEF'] == 2,'MGEMLEEF']=35
df_ticdata.loc[df_ticdata['MGEMLEEF'] == 3,'MGEMLEEF']=45
df_ticdata.loc[df_ticdata['MGEMLEEF'] == 4,'MGEMLEEF']=55
df_ticdata.loc[df_ticdata['MGEMLEEF'] == 5,'MGEMLEEF']=65
df_ticdata.loc[df_ticdata['MGEMLEEF'] == 6,'MGEMLEEF']=75
[df_ticdata['MGEMLEEF']]
Out[19]:
[0       35
 1       35
 2       35
 3       45
 4       35
         ..
 5817    35
 5818    55
 5819    55
 5820    35
 5821    45
 Name: MGEMLEEF, Length: 5822, dtype: int64]

Dataframe with only numerical variables

In [20]:
#We compare our target variable in the train and test. 

#train
plot_df_ticdata = df_ticdata['CARAVAN']\
        .value_counts(normalize=True)\
        .mul(100).rename('percent').reset_index()

plot_df_ticdata_conteo = df_ticdata['CARAVAN'].value_counts().reset_index()
plot_df_ticdata_pc = pd.merge(plot_df_ticdata, plot_df_ticdata_conteo, on=['index'], how='inner')


#test
plot_df_ticeval = df_ticeval['CARAVAN']\
        .value_counts(normalize=True)\
        .mul(100).rename('percent').reset_index()

plot_df_ticeval_conteo = df_ticeval['CARAVAN'].value_counts().reset_index()
plot_df_ticeval_pc = pd.merge(plot_df_ticeval, plot_df_ticeval_conteo, on=['index'], how='inner')


print(plot_df_ticdata_pc)
print(plot_df_ticeval_pc)
   index    percent  CARAVAN
0      0  94.022673     5474
1      1   5.977327      348
   index  percent  CARAVAN
0      0    94.05     3762
1      1     5.95      238
In [21]:
#We will create two histograms of train and test.
fig_train = px.histogram(plot_df_ticdata_pc, x="index", y=['percent'])
fig_train.show()
fig_test = px.histogram(plot_df_ticeval_pc, x="index", y=['percent'])
fig_test.show()
In [22]:
#We create a data frame with the categorical variables.
df_ticdata_categorical_variables = df_ticdata [["MOSTYPE", "MOSHOOFD", "MGODRK",
                        "MGODPR", "MGODOV", "MGODGE",
                        "MRELGE", "MRELSA", "MRELOV", "MFALLEEN",
                        "MFGEKIND", "MFWEKIND", "MOPLHOOG", "MOPLMIDD",
                        "MOPLLAAG", "MBERHOOG", "MBERZELF",
                        "MBERBOER", "MBERMIDD", "MBERARBG", "MBERARBO",
                        "MSKA","MSKB1", "MSKB2", "MSKC",
                        "MSKD", "MHHUUR", "MHKOOP", "MAUT1", "MAUT2",
                        "MAUT0", "MZFONDS", "MZPART", "MINKM30",
                        "MINK3045", "MINK4575", "MINK7512", "MINK123M",
                        "MINKGEM", "MKOOPKLA", "PWAPART", "PWABEDR", "PWALAND",
                        "PPERSAUT", "PBESAUT", "PMOTSCO", "PVRAAUT",
                        "PAANHANG", "PTRACTOR", "PWERKT", "PBROM",
                        "PLEVEN", "PPERSONG", "PGEZONG", "PWAOREG",
                        "PBRAND", "PZEILPL", "PPLEZIER", "PFIETS","PINBOED", "PBYSTAND","CARAVAN"]]

df_ticdata_categorical_variables.head()
Out[22]:
MOSTYPE MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575 MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND CARAVAN
0 33 8 0 5 1 3 7 0 2 1 2 6 1 2 7 1 0 1 2 5 2 1 1 2 6 1 1 8 8 0 1 8 1 0 4 5 0 0 4 3 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0
1 37 8 1 4 1 4 6 2 2 0 4 5 0 5 4 0 0 0 5 0 4 0 2 3 5 0 2 7 7 1 2 6 3 2 0 5 2 0 5 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
2 37 8 0 4 2 4 3 2 4 4 4 2 0 5 4 0 0 0 7 0 2 0 5 0 4 0 7 2 7 0 2 9 0 4 5 0 0 0 3 4 2 0 0 6 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
3 9 3 2 3 2 4 5 2 2 2 3 4 3 4 2 4 0 0 3 1 2 3 2 1 4 0 5 4 9 0 0 7 2 1 5 3 0 0 4 4 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
4 40 10 1 4 1 4 7 1 2 2 4 4 5 4 0 0 5 4 0 0 0 9 0 0 0 0 4 5 6 2 1 5 4 0 0 9 0 0 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0
In [23]:
#We create a data frame with the numerical variables.
df_ticdata_numerical_variables = df_ticdata[["MAANTHUI", "MGEMOMV", "MGEMLEEF","AWAPART","AWABEDR", "AWALAND","APERSAUT","ABESAUT","AMOTSCO",
                            "AVRAAUT","AAANHANG","ATRACTOR","AWERKT","ABROM","ALEVEN","APERSONG","AGEZONG",
                            "AWAOREG","ABRAND","AZEILPL","APLEZIER", "AFIETS", "AINBOED", "ABYSTAND", "CARAVAN"]]
df_ticdata_numerical_variables.head()
Out[23]:
MAANTHUI MGEMOMV MGEMLEEF AWAPART AWABEDR AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND CARAVAN
0 1 3 35 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
1 1 2 35 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
2 1 2 35 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
3 1 3 45 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
4 1 4 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0

Study of our variables in the dataframe TRAIN (ticdata)¶

In [24]:
def get_corr_matrix(dataset = None , metodo='spearman', size_figure=[10,8]):
    #To obtain the Spearman´s correlation, only change the method

    if dataset is None:
        print(u'\nNeed to add arguments to the function')
        return 1
    sns.set(style="white")
    # Compute the correlation matrix
    corr = dataset.corr(method=metodo) 
    # Set self-correlation to zero to avoid distraction
    for i in range(corr.shape[0]):
        corr.iloc[i, i] = 0
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=size_figure)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5,  cmap ='viridis' ) #cbar_kws={"shrink": .5}
    plt.show()
In [25]:
#In the upper left side we have the correlation of the categorical variables and in the lower right side we have
#the correlation  of the policies.
get_corr_matrix(dataset = df_ticdata, size_figure=[10,8])
In [26]:
#We do the correlation matrix of the categorical variables.
get_corr_matrix(dataset = df_ticdata_categorical_variables, size_figure=[10,8])
In [27]:
#We do the correlation matrix of the numerical variables. 
get_corr_matrix(dataset = df_ticdata_numerical_variables, 
                metodo='pearson', size_figure=[10,8])
In [28]:
#We create a data frame with the policies.
df_ticdata_policies = df_ticdata[["PWAPART", "PWABEDR", "PWALAND",
                        "PPERSAUT", "PBESAUT", "PMOTSCO", "PVRAAUT",
                        "PAANHANG", "PTRACTOR", "PWERKT", "PBROM",
                        "PLEVEN", "PPERSONG", "PGEZONG", "PWAOREG",
                        "PBRAND", "PZEILPL", "PPLEZIER", "PFIETS","PINBOED", "PBYSTAND","AWAPART","AWABEDR",
                        "AWALAND","APERSAUT","ABESAUT","AMOTSCO","AVRAAUT","AAANHANG","ATRACTOR",
                         "AWERKT","ABROM","ALEVEN","APERSONG","AGEZONG","AWAOREG","ABRAND","AZEILPL","APLEZIER",
                         "AFIETS", "AINBOED", "ABYSTAND", "CARAVAN", "TRAIN"]]

get_corr_matrix(dataset = df_ticdata_policies, size_figure=[10,8])
In [29]:
df_ticdata_policies.corr(method='spearman').style.background_gradient(cmap='coolwarm')
Out[29]:
  PWAPART PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND CARAVAN TRAIN
PWAPART 1.000000 -0.047163 -0.111101 0.158245 -0.040925 0.023527 -0.022796 -0.019500 -0.075141 -0.024934 -0.153253 0.139312 -0.010215 0.056989 -0.001413 0.512962 0.013126 -0.003985 -0.011382 0.042868 0.047596 0.989286 -0.047255 -0.111043 0.152571 -0.040878 0.023880 -0.022800 -0.019434 -0.075159 -0.024928 -0.152564 0.140592 -0.010157 0.056973 -0.001384 0.558860 0.013123 -0.004017 -0.011138 0.042850 0.047635 0.095332 nan
PWABEDR -0.047163 1.000000 0.033714 -0.011779 0.216533 -0.015888 0.144653 0.085160 0.075760 0.115155 -0.032271 0.019661 -0.008745 -0.009688 0.224897 0.081550 -0.002714 -0.009024 -0.019236 0.022890 -0.001808 -0.049170 0.999970 0.033807 -0.019640 0.216492 -0.016128 0.144678 0.085141 0.075443 0.115076 -0.032286 0.019100 -0.008745 -0.009688 0.224926 0.046095 -0.002714 -0.009024 -0.019235 0.022781 -0.001887 0.000573 nan
PWALAND -0.111101 0.033714 1.000000 0.079374 0.026701 -0.004038 -0.005708 0.099282 0.557020 0.152154 -0.010308 0.000528 0.039775 0.018207 0.049273 0.206860 -0.003294 0.004915 -0.015717 0.001078 0.003103 -0.111562 0.033855 0.999944 0.084595 0.026700 -0.003740 -0.005708 0.099085 0.557292 0.152128 -0.010112 0.000111 0.039732 0.018274 0.049310 0.130971 -0.003294 0.004987 -0.015763 0.001140 0.002974 -0.021325 nan
PPERSAUT 0.158245 -0.011779 0.079374 1.000000 0.019013 0.057908 0.010823 0.045688 0.080298 0.031518 -0.176222 0.070992 0.010389 0.046971 -0.001973 0.113067 -0.006312 0.036191 -0.036268 0.018230 0.091473 0.149409 -0.011997 0.079418 0.949831 0.019014 0.057821 0.010817 0.045691 0.080241 0.031550 -0.175535 0.071357 0.010432 0.046916 -0.001959 0.047518 -0.006317 0.036233 -0.036053 0.018161 0.091598 0.163670 nan
PBESAUT -0.040925 0.216533 0.026701 0.019013 1.000000 0.031521 0.238382 0.099526 0.047395 0.153853 -0.024618 0.022324 -0.006671 -0.007390 0.054893 0.021854 -0.002070 -0.006884 -0.002552 -0.008047 0.021293 -0.042272 0.214862 0.026826 0.018165 0.999994 0.031403 0.238391 0.099385 0.047097 0.153848 -0.024629 0.022626 -0.006671 -0.007390 0.054851 -0.005249 -0.002070 -0.006884 -0.002627 -0.008047 0.021374 -0.006945 nan
PMOTSCO 0.023527 -0.015888 -0.004038 0.057908 0.031521 1.000000 -0.007833 -0.003964 -0.003073 -0.011977 -0.043150 0.036764 0.022904 -0.005177 0.002173 0.010787 -0.004520 -0.002718 -0.014718 0.013142 0.022248 0.019111 -0.015964 -0.003981 0.060993 0.031393 0.999814 -0.007833 -0.003987 -0.003076 -0.011977 -0.043110 0.036714 0.022946 -0.005153 0.002163 -0.012002 -0.004520 -0.002743 -0.014679 0.013159 0.022319 0.009914 nan
PVRAAUT -0.022796 0.144653 -0.005708 0.010823 0.238382 -0.007833 1.000000 0.079573 0.051184 0.070785 -0.010624 -0.009054 -0.002879 -0.003189 0.067218 0.016459 -0.000893 -0.002971 0.021535 -0.003473 -0.004703 -0.023340 0.143625 -0.005708 0.010545 0.238184 -0.007834 1.000000 0.079446 0.050680 0.070672 -0.010629 -0.009055 -0.002879 -0.003189 0.067170 0.000005 -0.000893 -0.002971 0.021362 -0.003473 -0.004703 -0.009921 nan
PAANHANG -0.019500 0.085160 0.099282 0.045688 0.099526 -0.003964 0.079573 1.000000 0.078444 0.103235 -0.008929 0.004821 0.014703 0.011482 0.019654 0.048699 0.069743 0.035834 -0.006662 0.046318 -0.012700 -0.020626 0.084773 0.099720 0.036971 0.099597 -0.004010 0.079530 0.999989 0.078633 0.103349 -0.009300 0.005718 0.014730 0.011527 0.019636 0.030697 0.069755 0.035594 -0.006726 0.046351 -0.012700 0.014593 nan
PTRACTOR -0.075141 0.075760 0.557020 0.080298 0.047395 -0.003073 0.051184 0.078444 1.000000 0.215082 -0.006995 -0.005496 0.049669 0.015186 0.078890 0.168906 -0.003603 0.002618 -0.011351 -0.001531 0.019099 -0.075954 0.076020 0.556516 0.084904 0.047440 -0.002730 0.051194 0.078122 0.999956 0.215087 -0.007289 -0.006281 0.049647 0.015157 0.078898 0.100894 -0.003603 0.002633 -0.011438 -0.001475 0.018860 -0.016444 nan
PWERKT -0.024934 0.115155 0.152154 0.031518 0.153853 -0.011977 0.070785 0.103235 0.215082 1.000000 0.006295 -0.000076 0.074417 -0.004877 0.087669 0.057481 -0.001366 -0.004543 -0.009684 -0.005310 -0.007191 -0.025988 0.114593 0.152525 0.032054 0.153608 -0.011979 0.070747 0.103096 0.214361 0.999999 0.006422 -0.000999 0.074268 -0.004877 0.087606 0.030828 -0.001366 -0.004543 -0.009683 -0.005310 -0.007191 -0.015170 nan
PBROM -0.153253 -0.032271 -0.010308 -0.176222 -0.024618 -0.043150 -0.010624 -0.008929 -0.006995 0.006295 1.000000 -0.046376 -0.019755 -0.005067 -0.017005 -0.176993 -0.006131 -0.020386 -0.025951 -0.023830 -0.015362 -0.153855 -0.032272 -0.010319 -0.183393 -0.024618 -0.043182 -0.010624 -0.008924 -0.006887 0.006287 0.999482 -0.046699 -0.019755 -0.005085 -0.017005 -0.199225 -0.006131 -0.020386 -0.026058 -0.023831 -0.015368 -0.045199 nan
PLEVEN 0.139312 0.019661 0.000528 0.070992 0.022324 0.036764 -0.009054 0.004821 -0.005496 -0.000076 -0.046376 1.000000 0.037271 0.128552 -0.001879 0.130436 -0.005225 0.004139 -0.001883 0.025079 0.027143 0.135659 0.019363 0.000296 0.068048 0.022213 0.037025 -0.009054 0.004952 -0.005406 -0.000071 -0.046309 0.999237 0.037279 0.128624 -0.001888 0.105966 -0.005225 0.004059 -0.001979 0.025088 0.027114 0.018654 nan
PPERSONG -0.010215 -0.008745 0.039775 0.010389 -0.006671 0.022904 -0.002879 0.014703 0.049669 0.074417 -0.019755 0.037271 1.000000 -0.005930 -0.004608 0.013169 -0.001661 0.025725 -0.011775 -0.006457 -0.008745 -0.011943 -0.008745 0.039263 0.006398 -0.006671 0.022291 -0.002879 0.014638 0.049704 0.074539 -0.019764 0.036003 0.999996 -0.005930 -0.004608 0.010739 -0.001661 0.025865 -0.011775 -0.006457 -0.008745 -0.008504 nan
PGEZONG 0.056989 -0.009688 0.018207 0.046971 -0.007390 -0.005177 -0.003189 0.011482 0.015186 -0.004877 -0.005067 0.128552 -0.005930 1.000000 -0.005105 0.059531 -0.001840 -0.006120 0.014087 0.017177 0.117668 0.055236 -0.009688 0.018200 0.047005 -0.007390 -0.005046 -0.003189 0.011622 0.015181 -0.004877 -0.004976 0.127982 -0.005930 0.999995 -0.005105 0.043113 -0.001840 -0.006120 0.014678 0.017144 0.117187 0.033663 nan
PWAOREG -0.001413 0.224897 0.049273 -0.001973 0.054893 0.002173 0.067218 0.019654 0.078890 0.087669 -0.017005 -0.001879 -0.004608 -0.005105 1.000000 0.053754 -0.001430 -0.004755 -0.010136 -0.005558 -0.007527 -0.007033 0.224792 0.048680 -0.003710 0.054737 0.001731 0.067182 0.019650 0.078786 0.087559 -0.017012 -0.002230 -0.004608 -0.005105 0.999998 0.030579 -0.001430 -0.004755 -0.010135 -0.005558 -0.007527 0.030322 nan
PBRAND 0.512962 0.081550 0.206860 0.113067 0.021854 0.010787 0.016459 0.048699 0.168906 0.057481 -0.176993 0.130436 0.013169 0.059531 0.053754 1.000000 0.011413 0.014918 -0.037512 0.036000 0.056083 0.512606 0.081178 0.206762 0.093533 0.021870 0.011351 0.016463 0.048626 0.168998 0.057455 -0.176473 0.131198 0.013179 0.059521 0.053801 0.915881 0.011410 0.014869 -0.037218 0.035995 0.055778 0.100531 nan
PZEILPL 0.013126 -0.002714 -0.003294 -0.006312 -0.002070 -0.004520 -0.000893 0.069743 -0.003603 -0.001366 -0.006131 -0.005225 -0.001661 -0.001840 -0.001430 0.011413 1.000000 0.099603 -0.003654 0.084003 -0.002714 0.012227 -0.002714 -0.003294 -0.008585 -0.002070 -0.004521 -0.000893 0.069534 -0.003603 -0.001366 -0.006133 -0.005225 -0.001661 -0.001840 -0.001430 0.004889 1.000000 0.099049 -0.003654 0.084389 -0.002714 0.026196 nan
PPLEZIER -0.003985 -0.009024 0.004915 0.036191 -0.006884 -0.002718 -0.002971 0.035834 0.002618 -0.004543 -0.020386 0.004139 0.025725 -0.006120 -0.004755 0.014918 0.099603 1.000000 -0.012151 0.045675 0.010324 -0.005913 -0.009024 0.005073 0.034027 -0.006884 -0.003088 -0.002971 0.035706 0.002684 -0.004543 -0.020395 0.003188 0.025763 -0.006120 -0.004755 0.003014 0.099620 0.999995 -0.012151 0.045757 0.010421 0.106366 nan
PFIETS -0.011382 -0.019236 -0.015717 -0.036268 -0.002552 -0.014718 0.021535 -0.006662 -0.011351 -0.009684 -0.025951 -0.001883 -0.011775 0.014087 -0.010136 -0.037512 -0.003654 -0.012151 1.000000 0.010756 0.008598 -0.013701 -0.019237 -0.015642 -0.041116 -0.002583 -0.014926 0.021559 -0.006692 -0.011321 -0.009684 -0.026125 -0.001787 -0.011776 0.014148 -0.010136 -0.041336 -0.003654 -0.012152 0.999939 0.010795 0.008632 0.028695 nan
PINBOED 0.042868 0.022890 0.001078 0.018230 -0.008047 0.013142 -0.003473 0.046318 -0.001531 -0.005310 -0.023830 0.025079 -0.006457 0.017177 -0.005558 0.036000 0.084003 0.045675 0.010756 1.000000 0.023047 0.039557 0.022836 0.000937 0.016920 -0.008047 0.013069 -0.003473 0.046521 -0.001475 -0.005310 -0.023841 0.025053 -0.006457 0.017232 -0.005558 0.028702 0.084018 0.045478 0.010601 0.999992 0.022836 0.019017 nan
PBYSTAND 0.047596 -0.001808 0.003103 0.091473 0.021293 0.022248 -0.004703 -0.012700 0.019099 -0.007191 -0.015362 0.027143 -0.008745 0.117668 -0.007527 0.056083 -0.002714 0.010324 0.008598 0.023047 1.000000 0.044489 -0.001842 0.003306 0.097158 0.021211 0.021993 -0.004703 -0.012700 0.019121 -0.007191 -0.014957 0.025198 -0.008745 0.117479 -0.007527 0.037231 -0.002714 0.010394 0.008656 0.022938 0.999970 0.068132 nan
AWAPART 0.989286 -0.049170 -0.111562 0.149409 -0.042272 0.019111 -0.023340 -0.020626 -0.075954 -0.025988 -0.153855 0.135659 -0.011943 0.055236 -0.007033 0.512606 0.012227 -0.005913 -0.013701 0.039557 0.044489 1.000000 -0.049260 -0.111496 0.142575 -0.042225 0.019443 -0.023345 -0.020552 -0.075988 -0.025982 -0.153167 0.137124 -0.011886 0.055224 -0.007001 0.563030 0.012225 -0.005944 -0.013473 0.039539 0.044533 0.090000 nan
AWABEDR -0.047255 0.999970 0.033855 -0.011997 0.214862 -0.015964 0.143625 0.084773 0.076020 0.114593 -0.032272 0.019363 -0.008745 -0.009688 0.224792 0.081178 -0.002714 -0.009024 -0.019237 0.022836 -0.001842 -0.049260 1.000000 0.033947 -0.019819 0.214820 -0.016202 0.143651 0.084755 0.075702 0.114514 -0.032287 0.018806 -0.008745 -0.009688 0.224820 0.045909 -0.002714 -0.009024 -0.019235 0.022727 -0.001921 0.000601 nan
AWALAND -0.111043 0.033807 0.999944 0.079418 0.026826 -0.003981 -0.005708 0.099720 0.556516 0.152525 -0.010319 0.000296 0.039263 0.018200 0.048680 0.206762 -0.003294 0.005073 -0.015642 0.000937 0.003306 -0.111496 0.033947 1.000000 0.084538 0.026826 -0.003682 -0.005708 0.099523 0.556786 0.152498 -0.010123 -0.000122 0.039221 0.018267 0.048716 0.130983 -0.003294 0.005145 -0.015689 0.000998 0.003175 -0.021279 nan
APERSAUT 0.152571 -0.019640 0.084595 0.949831 0.018165 0.060993 0.010545 0.036971 0.084904 0.032054 -0.183393 0.068048 0.006398 0.047005 -0.003710 0.093533 -0.008585 0.034027 -0.041116 0.016920 0.097158 0.142575 -0.019819 0.084538 1.000000 0.018171 0.060946 0.010538 0.036983 0.084814 0.032087 -0.182709 0.068628 0.006429 0.046937 -0.003699 0.031569 -0.008590 0.034069 -0.040895 0.016865 0.097339 0.149490 nan
ABESAUT -0.040878 0.216492 0.026700 0.019014 0.999994 0.031393 0.238184 0.099597 0.047440 0.153608 -0.024618 0.022213 -0.006671 -0.007390 0.054737 0.021870 -0.002070 -0.006884 -0.002583 -0.008047 0.021211 -0.042225 0.214820 0.026826 0.018171 1.000000 0.031275 0.238192 0.099456 0.047142 0.153603 -0.024629 0.022514 -0.006671 -0.007390 0.054695 -0.005220 -0.002070 -0.006884 -0.002658 -0.008047 0.021291 -0.006986 nan
AMOTSCO 0.023880 -0.016128 -0.003740 0.057821 0.031403 0.999814 -0.007834 -0.004010 -0.002730 -0.011979 -0.043182 0.037025 0.022291 -0.005046 0.001731 0.011351 -0.004521 -0.003088 -0.014926 0.013069 0.021993 0.019443 -0.016202 -0.003682 0.060946 0.031275 1.000000 -0.007834 -0.004033 -0.002734 -0.011979 -0.043143 0.036988 0.022333 -0.005022 0.001721 -0.011608 -0.004521 -0.003113 -0.014889 0.013087 0.022062 0.010358 nan
AVRAAUT -0.022800 0.144678 -0.005708 0.010817 0.238391 -0.007833 1.000000 0.079530 0.051194 0.070747 -0.010624 -0.009054 -0.002879 -0.003189 0.067182 0.016463 -0.000893 -0.002971 0.021559 -0.003473 -0.004703 -0.023345 0.143651 -0.005708 0.010538 0.238192 -0.007834 1.000000 0.079403 0.050691 0.070635 -0.010629 -0.009055 -0.002879 -0.003189 0.067134 0.000006 -0.000893 -0.002971 0.021386 -0.003473 -0.004703 -0.009921 nan
AAANHANG -0.019434 0.085141 0.099085 0.045691 0.099385 -0.003987 0.079446 0.999989 0.078122 0.103096 -0.008924 0.004952 0.014638 0.011622 0.019650 0.048626 0.069534 0.035706 -0.006692 0.046521 -0.012700 -0.020552 0.084755 0.099523 0.036983 0.099456 -0.004033 0.079403 1.000000 0.078309 0.103210 -0.009293 0.005852 0.014665 0.011667 0.019632 0.030662 0.069546 0.035467 -0.006756 0.046553 -0.012700 0.014540 nan
ATRACTOR -0.075159 0.075443 0.557292 0.080241 0.047097 -0.003076 0.050680 0.078633 0.999956 0.214361 -0.006887 -0.005406 0.049704 0.015181 0.078786 0.168998 -0.003603 0.002684 -0.011321 -0.001475 0.019121 -0.075988 0.075702 0.556786 0.084814 0.047142 -0.002734 0.050691 0.078309 1.000000 0.214364 -0.007182 -0.006192 0.049682 0.015152 0.078794 0.101040 -0.003603 0.002699 -0.011409 -0.001419 0.018881 -0.016650 nan
AWERKT -0.024928 0.115076 0.152128 0.031550 0.153848 -0.011977 0.070672 0.103349 0.215087 0.999999 0.006287 -0.000071 0.074539 -0.004877 0.087559 0.057455 -0.001366 -0.004543 -0.009684 -0.005310 -0.007191 -0.025982 0.114514 0.152498 0.032087 0.153603 -0.011979 0.070635 0.103210 0.214364 1.000000 0.006414 -0.000995 0.074390 -0.004877 0.087496 0.030810 -0.001366 -0.004543 -0.009683 -0.005310 -0.007191 -0.015170 nan
ABROM -0.152564 -0.032286 -0.010112 -0.175535 -0.024629 -0.043110 -0.010629 -0.009300 -0.007289 0.006422 0.999482 -0.046309 -0.019764 -0.004976 -0.017012 -0.176473 -0.006133 -0.020395 -0.026125 -0.023841 -0.014957 -0.153167 -0.032287 -0.010123 -0.182709 -0.024629 -0.043143 -0.010629 -0.009293 -0.007182 0.006414 1.000000 -0.046634 -0.019764 -0.004994 -0.017012 -0.198651 -0.006133 -0.020395 -0.026231 -0.023841 -0.014965 -0.045145 nan
ALEVEN 0.140592 0.019100 0.000111 0.071357 0.022626 0.036714 -0.009055 0.005718 -0.006281 -0.000999 -0.046699 0.999237 0.036003 0.127982 -0.002230 0.131198 -0.005225 0.003188 -0.001787 0.025053 0.025198 0.137124 0.018806 -0.000122 0.068628 0.022514 0.036988 -0.009055 0.005852 -0.006192 -0.000995 -0.046634 1.000000 0.036012 0.128055 -0.002239 0.106750 -0.005225 0.003111 -0.001877 0.025061 0.025169 0.019285 nan
APERSONG -0.010157 -0.008745 0.039732 0.010432 -0.006671 0.022946 -0.002879 0.014730 0.049647 0.074268 -0.019755 0.037279 0.999996 -0.005930 -0.004608 0.013179 -0.001661 0.025763 -0.011776 -0.006457 -0.008745 -0.011886 -0.008745 0.039221 0.006429 -0.006671 0.022333 -0.002879 0.014665 0.049682 0.074390 -0.019764 0.036012 1.000000 -0.005930 -0.004608 0.010772 -0.001661 0.025903 -0.011775 -0.006457 -0.008745 -0.008492 nan
AGEZONG 0.056973 -0.009688 0.018274 0.046916 -0.007390 -0.005153 -0.003189 0.011527 0.015157 -0.004877 -0.005085 0.128624 -0.005930 0.999995 -0.005105 0.059521 -0.001840 -0.006120 0.014148 0.017232 0.117479 0.055224 -0.009688 0.018267 0.046937 -0.007390 -0.005022 -0.003189 0.011667 0.015152 -0.004877 -0.004994 0.128055 -0.005930 1.000000 -0.005105 0.043093 -0.001840 -0.006120 0.014740 0.017198 0.116997 0.033548 nan
AWAOREG -0.001384 0.224926 0.049310 -0.001959 0.054851 0.002163 0.067170 0.019636 0.078898 0.087606 -0.017005 -0.001888 -0.004608 -0.005105 0.999998 0.053801 -0.001430 -0.004755 -0.010136 -0.005558 -0.007527 -0.007001 0.224820 0.048716 -0.003699 0.054695 0.001721 0.067134 0.019632 0.078794 0.087496 -0.017012 -0.002239 -0.004608 -0.005105 1.000000 0.030626 -0.001430 -0.004755 -0.010135 -0.005558 -0.007527 0.030290 nan
ABRAND 0.558860 0.046095 0.130971 0.047518 -0.005249 -0.012002 0.000005 0.030697 0.100894 0.030828 -0.199225 0.105966 0.010739 0.043113 0.030579 0.915881 0.004889 0.003014 -0.041336 0.028702 0.037231 0.563030 0.045909 0.130983 0.031569 -0.005220 -0.011608 0.000006 0.030662 0.101040 0.030810 -0.198651 0.106750 0.010772 0.043093 0.030626 1.000000 0.004887 0.002976 -0.041102 0.028684 0.036986 0.069493 nan
AZEILPL 0.013123 -0.002714 -0.003294 -0.006317 -0.002070 -0.004520 -0.000893 0.069755 -0.003603 -0.001366 -0.006131 -0.005225 -0.001661 -0.001840 -0.001430 0.011410 1.000000 0.099620 -0.003654 0.084018 -0.002714 0.012225 -0.002714 -0.003294 -0.008590 -0.002070 -0.004521 -0.000893 0.069546 -0.003603 -0.001366 -0.006133 -0.005225 -0.001661 -0.001840 -0.001430 0.004887 1.000000 0.099067 -0.003654 0.084404 -0.002714 0.026201 nan
APLEZIER -0.004017 -0.009024 0.004987 0.036233 -0.006884 -0.002743 -0.002971 0.035594 0.002633 -0.004543 -0.020386 0.004059 0.025865 -0.006120 -0.004755 0.014869 0.099049 0.999995 -0.012152 0.045478 0.010394 -0.005944 -0.009024 0.005145 0.034069 -0.006884 -0.003113 -0.002971 0.035467 0.002699 -0.004543 -0.020395 0.003111 0.025903 -0.006120 -0.004755 0.002976 0.099067 1.000000 -0.012151 0.045559 0.010491 0.106438 nan
AFIETS -0.011138 -0.019235 -0.015763 -0.036053 -0.002627 -0.014679 0.021362 -0.006726 -0.011438 -0.009683 -0.026058 -0.001979 -0.011775 0.014678 -0.010135 -0.037218 -0.003654 -0.012151 0.999939 0.010601 0.008656 -0.013473 -0.019235 -0.015689 -0.040895 -0.002658 -0.014889 0.021386 -0.006756 -0.011409 -0.009683 -0.026231 -0.001877 -0.011775 0.014740 -0.010135 -0.041102 -0.003654 -0.012151 1.000000 0.010639 0.008691 0.028869 nan
AINBOED 0.042850 0.022781 0.001140 0.018161 -0.008047 0.013159 -0.003473 0.046351 -0.001475 -0.005310 -0.023831 0.025088 -0.006457 0.017144 -0.005558 0.035995 0.084389 0.045757 0.010795 0.999992 0.022938 0.039539 0.022727 0.000998 0.016865 -0.008047 0.013087 -0.003473 0.046553 -0.001419 -0.005310 -0.023841 0.025061 -0.006457 0.017198 -0.005558 0.028684 0.084404 0.045559 0.010639 1.000000 0.022727 0.019106 nan
ABYSTAND 0.047635 -0.001887 0.002974 0.091598 0.021374 0.022319 -0.004703 -0.012700 0.018860 -0.007191 -0.015368 0.027114 -0.008745 0.117187 -0.007527 0.055778 -0.002714 0.010421 0.008632 0.022836 0.999970 0.044533 -0.001921 0.003175 0.097339 0.021291 0.022062 -0.004703 -0.012700 0.018881 -0.007191 -0.014965 0.025169 -0.008745 0.116997 -0.007527 0.036986 -0.002714 0.010491 0.008691 0.022727 1.000000 0.068222 nan
CARAVAN 0.095332 0.000573 -0.021325 0.163670 -0.006945 0.009914 -0.009921 0.014593 -0.016444 -0.015170 -0.045199 0.018654 -0.008504 0.033663 0.030322 0.100531 0.026196 0.106366 0.028695 0.019017 0.068132 0.090000 0.000601 -0.021279 0.149490 -0.006986 0.010358 -0.009921 0.014540 -0.016650 -0.015170 -0.045145 0.019285 -0.008492 0.033548 0.030290 0.069493 0.026201 0.106438 0.028869 0.019106 0.068222 1.000000 nan
TRAIN nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan

Study of categorical variables with Cramer ´s V¶

In [30]:
def cramers_v(var1,var2):
 
    crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None))
    chi2 = ss.chi2_contingency(crosstab)[0]
    n = crosstab.sum()
    phi2 = chi2 / n
    r, k = crosstab.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1),(rcorr-1)))

We use Cramer´s V to observ categorical variables that we considered be highly correlated, based on the descripcion of our dictionary.

In [31]:
cramers_v(df_ticdata_categorical_variables["MOSTYPE"], df_ticdata_categorical_variables["MOSHOOFD"])
Out[31]:
0.9974157937677048
In [32]:
#Average income  vs Purchasing power class

cramers_v(df_ticdata_categorical_variables["MINKGEM"], df_ticdata_categorical_variables["MKOOPKLA"])
Out[32]:
0.22021007250782393
In [33]:
# High status vs income > 123.000

cramers_v(df_ticdata_categorical_variables["MBERHOOG"], df_ticdata_categorical_variables["MINK123M"])
Out[33]:
0.16893247903544156
In [34]:
# High level of education vs income > 123.000

cramers_v(df_ticdata_categorical_variables["MOPLHOOG"], df_ticdata_categorical_variables["MINK123M"])
Out[34]:
0.1991300006594408
In [35]:
# social class D vs Unskilled labours

cramers_v(df_ticdata_categorical_variables["MSKD"], df_ticdata_categorical_variables["MBERARBO"])
Out[35]:
0.26855759802471696

After comprobate some variables, we understood could be highly correlated, we look what only Customer subtype and Customer main type. For a future model we could rescind one of them.

In [36]:
rows = []

for var1 in df_ticdata_categorical_variables:
    col = []
    for var2 in df_ticdata_categorical_variables:
        cramers = cramers_v(df_ticdata_categorical_variables[var1], df_ticdata_categorical_variables[var2])
        col.append(round(cramers, 2))
    rows.append(col)
    
cramers_results = np.array(rows)
df_vcramer = pd.DataFrame(cramers_results, columns = df_ticdata_categorical_variables.columns, index = df_ticdata_categorical_variables.columns)

df_vcramer
Out[36]:
MOSTYPE MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575 MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND CARAVAN
MOSTYPE 1.00 1.00 0.16 0.19 0.21 0.19 0.28 0.20 0.29 0.28 0.18 0.27 0.29 0.22 0.30 0.26 0.30 0.33 0.21 0.23 0.23 0.27 0.18 0.21 0.25 0.22 0.29 0.29 0.24 0.24 0.28 0.26 0.26 0.23 0.21 0.25 0.16 0.14 0.26 0.95 0.09 0.02 0.10 0.04 0.05 0.05 0.00 0.07 0.12 0.08 0.08 0.00 0.00 0.02 0.05 0.17 0.00 0.02 0.05 0.03 0.04 0.12
MOSHOOFD 1.00 1.00 0.09 0.11 0.12 0.10 0.21 0.14 0.20 0.20 0.11 0.20 0.21 0.15 0.22 0.18 0.12 0.22 0.13 0.16 0.17 0.19 0.12 0.13 0.19 0.15 0.21 0.21 0.15 0.14 0.18 0.19 0.19 0.17 0.13 0.18 0.10 0.08 0.20 0.61 0.07 0.02 0.11 0.05 0.00 0.03 0.01 0.04 0.13 0.07 0.04 0.04 0.02 0.03 0.00 0.16 0.03 0.00 0.03 0.02 0.05 0.12
MGODRK 0.16 0.09 1.00 0.22 0.17 0.12 0.10 0.17 0.10 0.07 0.07 0.08 0.12 0.09 0.11 0.12 0.16 0.06 0.09 0.09 0.08 0.10 0.09 0.08 0.10 0.07 0.09 0.09 0.13 0.14 0.12 0.14 0.14 0.10 0.08 0.08 0.14 0.13 0.13 0.10 0.02 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.02 0.00 0.00 0.00
MGODPR 0.19 0.11 0.22 1.00 0.25 0.43 0.14 0.17 0.13 0.10 0.11 0.11 0.11 0.11 0.11 0.13 0.14 0.11 0.13 0.12 0.12 0.11 0.09 0.12 0.11 0.12 0.14 0.14 0.15 0.16 0.15 0.12 0.12 0.12 0.10 0.11 0.08 0.08 0.12 0.10 0.00 0.00 0.04 0.06 0.00 0.00 0.00 0.00 0.02 0.03 0.02 0.00 0.01 0.00 0.00 0.06 0.00 0.02 0.00 0.00 0.02 0.05
MGODOV 0.21 0.12 0.17 0.25 1.00 0.16 0.18 0.24 0.20 0.14 0.13 0.14 0.12 0.12 0.10 0.15 0.12 0.20 0.13 0.15 0.13 0.15 0.12 0.13 0.11 0.13 0.16 0.16 0.16 0.14 0.20 0.16 0.16 0.11 0.12 0.15 0.08 0.09 0.10 0.09 0.02 0.00 0.02 0.01 0.00 0.00 0.00 0.00 0.04 0.02 0.02 0.03 0.00 0.00 0.00 0.03 0.00 0.00 0.02 0.00 0.00 0.03
MGODGE 0.19 0.10 0.12 0.43 0.16 1.00 0.14 0.16 0.14 0.09 0.12 0.11 0.09 0.10 0.10 0.13 0.14 0.10 0.11 0.11 0.10 0.10 0.08 0.10 0.10 0.11 0.14 0.14 0.12 0.11 0.13 0.14 0.14 0.12 0.09 0.11 0.04 0.09 0.11 0.09 0.00 0.00 0.03 0.01 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.02 0.00 0.01 0.05 0.00 0.00 0.04 0.00 0.01 0.04
MRELGE 0.28 0.21 0.10 0.14 0.18 0.14 1.00 0.31 0.61 0.35 0.12 0.25 0.12 0.12 0.14 0.12 0.10 0.11 0.11 0.12 0.17 0.11 0.12 0.10 0.11 0.16 0.20 0.20 0.24 0.16 0.31 0.18 0.18 0.21 0.12 0.15 0.10 0.07 0.19 0.17 0.04 0.00 0.02 0.02 0.00 0.03 0.00 0.00 0.02 0.00 0.00 0.01 0.00 0.01 0.00 0.07 0.00 0.02 0.04 0.03 0.02 0.06
MRELSA 0.20 0.14 0.17 0.17 0.24 0.16 0.31 1.00 0.16 0.13 0.14 0.12 0.09 0.10 0.09 0.12 0.14 0.09 0.12 0.11 0.12 0.11 0.12 0.10 0.10 0.11 0.15 0.14 0.13 0.14 0.16 0.17 0.17 0.10 0.09 0.11 0.09 0.07 0.08 0.08 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.02 0.00 0.00 0.01
MRELOV 0.29 0.20 0.10 0.13 0.20 0.14 0.61 0.16 1.00 0.46 0.13 0.22 0.10 0.10 0.13 0.11 0.12 0.10 0.11 0.12 0.17 0.10 0.09 0.12 0.11 0.18 0.19 0.19 0.31 0.18 0.39 0.16 0.16 0.23 0.12 0.14 0.09 0.07 0.23 0.17 0.04 0.00 0.04 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.06 0.00 0.00 0.06 0.02 0.00 0.06
MFALLEEN 0.28 0.20 0.07 0.10 0.14 0.09 0.35 0.13 0.46 1.00 0.18 0.32 0.14 0.12 0.14 0.12 0.13 0.10 0.11 0.13 0.12 0.12 0.18 0.16 0.12 0.18 0.19 0.19 0.21 0.12 0.28 0.12 0.12 0.23 0.13 0.15 0.10 0.08 0.20 0.16 0.04 0.00 0.01 0.01 0.01 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.01 0.00 0.07 0.00 0.00 0.00 0.00 0.02 0.04
MFGEKIND 0.18 0.11 0.07 0.11 0.13 0.12 0.12 0.14 0.13 0.18 1.00 0.44 0.11 0.18 0.15 0.12 0.13 0.08 0.16 0.12 0.11 0.16 0.16 0.16 0.12 0.11 0.14 0.14 0.12 0.10 0.10 0.10 0.10 0.13 0.14 0.12 0.08 0.09 0.13 0.11 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.05 0.00 0.03 0.00 0.00 0.02 0.00 0.03 0.00
MFWEKIND 0.27 0.20 0.08 0.11 0.14 0.11 0.25 0.12 0.22 0.32 0.44 1.00 0.10 0.14 0.14 0.11 0.10 0.11 0.14 0.13 0.11 0.12 0.14 0.15 0.11 0.14 0.17 0.17 0.16 0.13 0.18 0.09 0.09 0.16 0.13 0.15 0.09 0.08 0.13 0.16 0.04 0.01 0.03 0.03 0.00 0.00 0.00 0.00 0.03 0.00 0.02 0.00 0.00 0.02 0.03 0.07 0.00 0.00 0.03 0.02 0.01 0.02
MOPLHOOG 0.29 0.21 0.12 0.11 0.12 0.09 0.12 0.09 0.10 0.14 0.11 0.10 1.00 0.17 0.29 0.31 0.26 0.12 0.15 0.18 0.16 0.37 0.20 0.16 0.24 0.15 0.18 0.18 0.11 0.12 0.13 0.25 0.24 0.16 0.17 0.20 0.21 0.20 0.24 0.21 0.03 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.03 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.06 0.03 0.03 0.08
MOPLMIDD 0.22 0.15 0.09 0.11 0.12 0.10 0.12 0.10 0.10 0.12 0.18 0.14 0.17 1.00 0.54 0.14 0.14 0.12 0.25 0.19 0.13 0.16 0.24 0.22 0.22 0.19 0.15 0.15 0.11 0.10 0.09 0.13 0.13 0.13 0.16 0.14 0.09 0.07 0.13 0.13 0.01 0.00 0.00 0.03 0.02 0.00 0.00 0.00 0.02 0.00 0.02 0.04 0.00 0.04 0.00 0.03 0.00 0.00 0.04 0.00 0.03 0.04
MOPLLAAG 0.30 0.22 0.11 0.11 0.10 0.10 0.14 0.09 0.13 0.14 0.15 0.14 0.29 0.54 1.00 0.26 0.19 0.14 0.21 0.25 0.21 0.31 0.24 0.21 0.39 0.23 0.20 0.19 0.13 0.12 0.13 0.22 0.22 0.18 0.17 0.20 0.14 0.11 0.21 0.22 0.04 0.00 0.03 0.02 0.00 0.00 0.00 0.00 0.03 0.00 0.01 0.03 0.00 0.04 0.00 0.04 0.00 0.00 0.04 0.03 0.02 0.09
MBERHOOG 0.26 0.18 0.12 0.13 0.15 0.13 0.12 0.12 0.11 0.12 0.12 0.11 0.31 0.14 0.26 1.00 0.18 0.11 0.18 0.20 0.21 0.49 0.17 0.19 0.25 0.15 0.18 0.18 0.14 0.17 0.15 0.30 0.30 0.15 0.13 0.21 0.18 0.17 0.22 0.21 0.04 0.02 0.01 0.02 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.02 0.00 0.03 0.02 0.04 0.04 0.00 0.05 0.04 0.00 0.07
MBERZELF 0.30 0.12 0.16 0.14 0.12 0.14 0.10 0.14 0.12 0.13 0.13 0.10 0.26 0.14 0.19 0.18 1.00 0.25 0.16 0.16 0.17 0.34 0.14 0.14 0.16 0.17 0.16 0.16 0.12 0.18 0.13 0.18 0.18 0.15 0.14 0.23 0.19 0.17 0.20 0.10 0.01 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.01 0.02 0.00 0.03 0.00 0.00 0.00 0.08 0.03 0.02 0.02 0.00 0.03 0.00
MBERBOER 0.33 0.22 0.06 0.11 0.20 0.10 0.11 0.09 0.10 0.10 0.08 0.11 0.12 0.12 0.14 0.11 0.25 1.00 0.14 0.11 0.12 0.18 0.09 0.17 0.13 0.14 0.15 0.15 0.12 0.14 0.11 0.13 0.13 0.11 0.11 0.15 0.09 0.06 0.09 0.12 0.04 0.02 0.12 0.02 0.05 0.00 0.02 0.06 0.14 0.13 0.03 0.02 0.03 0.00 0.00 0.09 0.07 0.00 0.04 0.00 0.00 0.04
MBERMIDD 0.21 0.13 0.09 0.13 0.13 0.11 0.11 0.12 0.11 0.11 0.16 0.14 0.15 0.25 0.21 0.18 0.16 0.14 1.00 0.21 0.20 0.16 0.29 0.17 0.17 0.15 0.13 0.13 0.15 0.13 0.12 0.14 0.14 0.13 0.15 0.12 0.10 0.10 0.14 0.12 0.00 0.00 0.02 0.02 0.00 0.02 0.00 0.00 0.03 0.02 0.00 0.00 0.00 0.00 0.03 0.04 0.00 0.00 0.02 0.00 0.02 0.05
MBERARBG 0.23 0.16 0.09 0.12 0.15 0.11 0.12 0.11 0.12 0.13 0.12 0.13 0.18 0.19 0.25 0.20 0.16 0.11 0.21 1.00 0.17 0.18 0.16 0.15 0.33 0.15 0.16 0.16 0.12 0.14 0.12 0.18 0.18 0.13 0.13 0.16 0.14 0.12 0.14 0.15 0.02 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.01 0.00 0.03 0.00 0.00 0.04 0.00 0.00 0.06
MBERARBO 0.23 0.17 0.08 0.12 0.13 0.10 0.17 0.12 0.17 0.12 0.11 0.11 0.16 0.13 0.21 0.21 0.17 0.12 0.20 0.17 1.00 0.19 0.13 0.14 0.21 0.27 0.19 0.18 0.15 0.12 0.17 0.17 0.17 0.20 0.14 0.15 0.13 0.11 0.17 0.17 0.01 0.00 0.00 0.01 0.02 0.00 0.00 0.00 0.02 0.00 0.01 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.06 0.00 0.02 0.05
MSKA 0.27 0.19 0.10 0.11 0.15 0.10 0.11 0.11 0.10 0.12 0.16 0.12 0.37 0.16 0.31 0.49 0.34 0.18 0.16 0.18 0.19 1.00 0.15 0.16 0.27 0.16 0.19 0.19 0.11 0.14 0.14 0.29 0.29 0.15 0.17 0.26 0.18 0.16 0.22 0.18 0.03 0.00 0.02 0.02 0.00 0.00 0.00 0.00 0.02 0.00 0.01 0.03 0.00 0.01 0.02 0.05 0.05 0.00 0.04 0.05 0.04 0.09
MSKB1 0.18 0.12 0.09 0.09 0.12 0.08 0.12 0.12 0.09 0.18 0.16 0.14 0.20 0.24 0.24 0.17 0.14 0.09 0.29 0.16 0.13 0.15 1.00 0.15 0.18 0.17 0.11 0.11 0.09 0.09 0.08 0.12 0.12 0.12 0.12 0.17 0.11 0.16 0.12 0.12 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.04 0.00 0.03 0.00 0.00 0.04 0.03 0.00 0.05
MSKB2 0.21 0.13 0.08 0.12 0.13 0.10 0.10 0.10 0.12 0.16 0.16 0.15 0.16 0.22 0.21 0.19 0.14 0.17 0.17 0.15 0.14 0.16 0.15 1.00 0.24 0.17 0.15 0.15 0.10 0.10 0.12 0.12 0.12 0.13 0.14 0.15 0.12 0.09 0.13 0.10 0.00 0.06 0.02 0.01 0.00 0.00 0.00 0.00 0.06 0.00 0.00 0.01 0.00 0.00 0.25 0.03 0.02 0.00 0.00 0.00 0.00 0.00
MSKC 0.25 0.19 0.10 0.11 0.11 0.10 0.11 0.10 0.11 0.12 0.12 0.11 0.24 0.22 0.39 0.25 0.16 0.13 0.17 0.33 0.21 0.27 0.18 0.24 1.00 0.16 0.19 0.20 0.11 0.12 0.12 0.22 0.22 0.16 0.17 0.17 0.14 0.11 0.18 0.18 0.02 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.02 0.00 0.03 0.00 0.02 0.01 0.02 0.00 0.07
MSKD 0.22 0.15 0.07 0.12 0.13 0.11 0.16 0.11 0.18 0.18 0.11 0.14 0.15 0.19 0.23 0.15 0.17 0.14 0.15 0.15 0.27 0.16 0.17 0.17 0.16 1.00 0.17 0.17 0.18 0.10 0.21 0.16 0.16 0.18 0.14 0.17 0.11 0.11 0.16 0.16 0.03 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.04 0.00 0.04 0.00 0.00 0.00 0.06
MHHUUR 0.29 0.21 0.09 0.14 0.16 0.14 0.20 0.15 0.19 0.19 0.14 0.17 0.18 0.15 0.20 0.18 0.16 0.15 0.13 0.16 0.19 0.19 0.11 0.15 0.19 0.17 1.00 0.99 0.16 0.19 0.18 0.20 0.20 0.22 0.11 0.19 0.15 0.11 0.19 0.21 0.03 0.02 0.04 0.04 0.00 0.00 0.01 0.01 0.05 0.00 0.00 0.01 0.00 0.03 0.02 0.11 0.00 0.00 0.03 0.00 0.03 0.08
MHKOOP 0.29 0.21 0.09 0.14 0.16 0.14 0.20 0.14 0.19 0.19 0.14 0.17 0.18 0.15 0.19 0.18 0.16 0.15 0.13 0.16 0.18 0.19 0.11 0.15 0.20 0.17 0.99 1.00 0.16 0.19 0.18 0.20 0.20 0.22 0.11 0.19 0.15 0.11 0.19 0.21 0.03 0.02 0.04 0.04 0.00 0.00 0.01 0.01 0.05 0.00 0.00 0.01 0.00 0.03 0.02 0.11 0.00 0.00 0.03 0.00 0.03 0.08
MAUT1 0.24 0.15 0.13 0.15 0.16 0.12 0.24 0.13 0.31 0.21 0.12 0.16 0.11 0.11 0.13 0.14 0.12 0.12 0.15 0.12 0.15 0.11 0.09 0.10 0.11 0.18 0.16 0.16 1.00 0.29 0.64 0.14 0.14 0.20 0.12 0.14 0.08 0.06 0.17 0.13 0.04 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.02 0.00 0.00 0.00 0.02 0.00 0.04 0.00 0.04 0.05 0.00 0.01 0.07
MAUT2 0.24 0.14 0.14 0.16 0.14 0.11 0.16 0.14 0.18 0.12 0.10 0.13 0.12 0.10 0.12 0.17 0.18 0.14 0.13 0.14 0.12 0.14 0.09 0.10 0.12 0.10 0.19 0.19 0.29 1.00 0.17 0.21 0.21 0.12 0.10 0.11 0.16 0.14 0.15 0.12 0.03 0.00 0.02 0.01 0.00 0.00 0.00 0.02 0.04 0.00 0.00 0.01 0.00 0.00 0.00 0.06 0.09 0.03 0.00 0.00 0.00 0.00
MAUT0 0.28 0.18 0.12 0.15 0.20 0.13 0.31 0.16 0.39 0.28 0.10 0.18 0.13 0.09 0.13 0.15 0.13 0.11 0.12 0.12 0.17 0.14 0.08 0.12 0.12 0.21 0.18 0.18 0.64 0.17 1.00 0.21 0.21 0.24 0.12 0.15 0.10 0.08 0.20 0.18 0.05 0.00 0.02 0.02 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.07 0.00 0.03 0.04 0.00 0.03 0.08
MZFONDS 0.26 0.19 0.14 0.12 0.16 0.14 0.18 0.17 0.16 0.12 0.10 0.09 0.25 0.13 0.22 0.30 0.18 0.13 0.14 0.18 0.17 0.29 0.12 0.12 0.22 0.16 0.20 0.20 0.14 0.21 0.21 1.00 0.99 0.15 0.15 0.19 0.17 0.13 0.20 0.19 0.05 0.00 0.03 0.02 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.02 0.02 0.02 0.00 0.05 0.04 0.00 0.02 0.06 0.03 0.05
MZPART 0.26 0.19 0.14 0.12 0.16 0.14 0.18 0.17 0.16 0.12 0.10 0.09 0.24 0.13 0.22 0.30 0.18 0.13 0.14 0.18 0.17 0.29 0.12 0.12 0.22 0.16 0.20 0.20 0.14 0.21 0.21 0.99 1.00 0.15 0.14 0.19 0.17 0.13 0.21 0.19 0.05 0.00 0.02 0.02 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.02 0.02 0.01 0.00 0.05 0.04 0.00 0.02 0.06 0.03 0.05
MINKM30 0.23 0.17 0.10 0.12 0.11 0.12 0.21 0.10 0.23 0.23 0.13 0.16 0.16 0.13 0.18 0.15 0.15 0.11 0.13 0.13 0.20 0.15 0.12 0.13 0.16 0.18 0.22 0.22 0.20 0.12 0.24 0.15 0.15 1.00 0.27 0.26 0.15 0.08 0.36 0.18 0.03 0.00 0.01 0.04 0.01 0.00 0.00 0.00 0.02 0.00 0.00 0.04 0.00 0.00 0.00 0.07 0.00 0.00 0.03 0.00 0.01 0.09
MINK3045 0.21 0.13 0.08 0.10 0.12 0.09 0.12 0.09 0.12 0.13 0.14 0.13 0.17 0.16 0.17 0.13 0.14 0.11 0.15 0.13 0.14 0.17 0.12 0.14 0.17 0.14 0.11 0.11 0.12 0.10 0.12 0.15 0.14 0.27 1.00 0.26 0.15 0.13 0.21 0.12 0.02 0.00 0.03 0.03 0.00 0.01 0.02 0.02 0.03 0.02 0.03 0.01 0.00 0.00 0.00 0.02 0.03 0.03 0.02 0.00 0.00 0.00
MINK4575 0.25 0.18 0.08 0.11 0.15 0.11 0.15 0.11 0.14 0.15 0.12 0.15 0.20 0.14 0.20 0.21 0.23 0.15 0.12 0.16 0.15 0.26 0.17 0.15 0.17 0.17 0.19 0.19 0.14 0.11 0.15 0.19 0.19 0.26 0.26 1.00 0.12 0.09 0.29 0.18 0.03 0.00 0.04 0.02 0.00 0.00 0.00 0.00 0.05 0.01 0.00 0.03 0.02 0.04 0.03 0.08 0.00 0.00 0.02 0.04 0.00 0.07
MINK7512 0.16 0.10 0.14 0.08 0.08 0.04 0.10 0.09 0.09 0.10 0.08 0.09 0.21 0.09 0.14 0.18 0.19 0.09 0.10 0.14 0.13 0.18 0.11 0.12 0.14 0.11 0.15 0.15 0.08 0.16 0.10 0.17 0.17 0.15 0.15 0.12 1.00 0.14 0.33 0.13 0.06 0.00 0.00 0.00 0.01 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.04 0.04 0.00 0.00 0.00 0.10 0.06
MINK123M 0.14 0.08 0.13 0.08 0.09 0.09 0.07 0.07 0.07 0.08 0.09 0.08 0.20 0.07 0.11 0.17 0.17 0.06 0.10 0.12 0.11 0.16 0.16 0.09 0.11 0.11 0.11 0.11 0.06 0.14 0.08 0.13 0.13 0.08 0.13 0.09 0.14 1.00 0.37 0.09 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.00 0.00
MINKGEM 0.26 0.20 0.13 0.12 0.10 0.11 0.19 0.08 0.23 0.20 0.13 0.13 0.24 0.13 0.21 0.22 0.20 0.09 0.14 0.14 0.17 0.22 0.12 0.13 0.18 0.16 0.19 0.19 0.17 0.15 0.20 0.20 0.21 0.36 0.21 0.29 0.33 0.37 1.00 0.22 0.02 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.01 0.00 0.02 0.00 0.06 0.00 0.00 0.04 0.00 0.02 0.11
MKOOPKLA 0.95 0.61 0.10 0.10 0.09 0.09 0.17 0.08 0.17 0.16 0.11 0.16 0.21 0.13 0.22 0.21 0.10 0.12 0.12 0.15 0.17 0.18 0.12 0.10 0.18 0.16 0.21 0.21 0.13 0.12 0.18 0.19 0.19 0.18 0.12 0.18 0.13 0.09 0.22 1.00 0.06 0.00 0.05 0.02 0.00 0.03 0.00 0.02 0.05 0.02 0.03 0.03 0.00 0.03 0.00 0.11 0.00 0.02 0.04 0.02 0.04 0.11
PWAPART 0.09 0.07 0.02 0.00 0.02 0.00 0.04 0.00 0.04 0.04 0.00 0.04 0.03 0.01 0.04 0.04 0.01 0.04 0.00 0.02 0.01 0.03 0.01 0.00 0.02 0.03 0.03 0.03 0.04 0.03 0.05 0.05 0.05 0.03 0.02 0.03 0.06 0.00 0.02 0.06 1.00 0.03 0.06 0.10 0.06 0.02 0.00 0.00 0.04 0.00 0.08 0.13 0.00 0.04 0.08 0.36 0.00 0.00 0.00 0.00 0.02 0.10
PWABEDR 0.02 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.02 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.06 0.00 0.00 0.02 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 1.00 0.02 0.03 0.25 0.00 0.27 0.14 0.06 0.33 0.00 0.10 0.00 0.00 0.16 0.13 0.00 0.00 0.00 0.03 0.00 0.00
PWALAND 0.10 0.11 0.01 0.04 0.02 0.03 0.02 0.00 0.04 0.01 0.00 0.03 0.02 0.00 0.03 0.01 0.00 0.12 0.02 0.00 0.00 0.02 0.00 0.02 0.00 0.00 0.04 0.04 0.00 0.02 0.02 0.03 0.02 0.01 0.03 0.04 0.00 0.00 0.00 0.05 0.06 0.02 1.00 0.04 0.01 0.00 0.00 0.06 0.34 0.11 0.00 0.00 0.05 0.00 0.04 0.26 0.00 0.01 0.00 0.00 0.00 0.00
PPERSAUT 0.04 0.05 0.00 0.06 0.01 0.01 0.02 0.00 0.02 0.01 0.01 0.03 0.00 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.01 0.02 0.00 0.01 0.02 0.00 0.04 0.04 0.00 0.01 0.02 0.02 0.02 0.04 0.03 0.02 0.00 0.00 0.02 0.02 0.10 0.03 0.04 1.00 0.12 0.04 0.09 0.08 0.04 0.08 0.08 0.26 0.00 0.02 0.00 0.13 0.00 0.07 0.03 0.09 0.06 0.18
PBESAUT 0.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.05 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.01 0.00 0.00 0.00 0.06 0.25 0.01 0.12 1.00 0.00 0.17 0.17 0.04 0.24 0.00 0.10 0.00 0.00 0.03 0.09 0.00 0.00 0.00 0.00 0.00 0.00
PMOTSCO 0.05 0.03 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.03 0.02 0.00 0.00 0.04 0.00 1.00 0.00 0.07 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.02 0.06
PVRAAUT 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.27 0.00 0.09 0.17 0.00 1.00 0.22 0.08 0.12 0.00 0.00 0.00 0.00 0.04 0.04 0.00 0.00 0.01 0.00 0.00 0.00
PAANHANG 0.07 0.04 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.01 0.00 0.02 0.02 0.00 0.00 0.00 0.02 0.00 0.03 0.00 0.00 0.02 0.00 0.14 0.06 0.08 0.17 0.07 0.22 1.00 0.06 0.15 0.00 0.00 0.00 0.01 0.11 0.05 0.07 0.08 0.00 0.04 0.00 0.00
PTRACTOR 0.12 0.13 0.00 0.02 0.04 0.01 0.02 0.00 0.00 0.02 0.00 0.03 0.02 0.02 0.03 0.01 0.01 0.14 0.03 0.01 0.02 0.02 0.00 0.06 0.02 0.00 0.05 0.05 0.02 0.04 0.00 0.02 0.02 0.02 0.03 0.05 0.00 0.00 0.02 0.05 0.04 0.06 0.34 0.04 0.04 0.00 0.08 0.06 1.00 0.20 0.00 0.00 0.07 0.03 0.05 0.25 0.00 0.00 0.00 0.00 0.01 0.00
PWERKT 0.08 0.07 0.00 0.03 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.13 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.02 0.01 0.00 0.00 0.00 0.02 0.00 0.33 0.11 0.08 0.24 0.00 0.12 0.15 0.20 1.00 0.00 0.17 0.13 0.00 0.08 0.22 0.00 0.00 0.00 0.00 0.00 0.00
PBROM 0.08 0.04 0.01 0.02 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.03 0.02 0.01 0.00 0.00 0.03 0.00 0.00 0.01 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.03 0.08 0.00 0.00 0.08 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00 0.08 0.00 0.00 0.00 0.00 0.00 0.04
PLEVEN 0.00 0.04 0.00 0.00 0.03 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.04 0.03 0.02 0.03 0.02 0.00 0.00 0.00 0.03 0.00 0.01 0.00 0.00 0.01 0.01 0.00 0.01 0.00 0.02 0.02 0.04 0.01 0.03 0.00 0.00 0.01 0.03 0.13 0.10 0.00 0.26 0.10 0.00 0.00 0.00 0.00 0.17 0.00 1.00 0.03 0.13 0.00 0.13 0.00 0.00 0.00 0.00 0.04 0.02
PPERSONG 0.00 0.02 0.00 0.01 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.02 0.02 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.05 0.00 0.00 0.02 0.00 0.00 0.07 0.13 0.00 0.03 1.00 0.00 0.00 0.04 0.00 0.03 0.00 0.00 0.00 0.00
PGEZONG 0.02 0.03 0.00 0.00 0.00 0.00 0.01 0.00 0.02 0.01 0.05 0.02 0.00 0.04 0.04 0.03 0.00 0.00 0.00 0.01 0.00 0.01 0.04 0.00 0.02 0.00 0.03 0.03 0.02 0.00 0.00 0.02 0.01 0.00 0.00 0.04 0.02 0.00 0.02 0.03 0.04 0.00 0.00 0.02 0.00 0.00 0.00 0.01 0.03 0.00 0.00 0.13 0.00 1.00 0.00 0.04 0.00 0.00 0.02 0.01 0.11 0.05
PWAOREG 0.05 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.02 0.00 0.00 0.03 0.00 0.00 0.02 0.00 0.25 0.00 0.00 0.02 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.08 0.16 0.04 0.00 0.03 0.00 0.04 0.11 0.05 0.08 0.00 0.00 0.00 0.00 1.00 0.07 0.00 0.00 0.00 0.00 0.00 0.03
PBRAND 0.17 0.16 0.01 0.06 0.03 0.05 0.07 0.03 0.06 0.07 0.03 0.07 0.03 0.03 0.04 0.04 0.08 0.09 0.04 0.03 0.03 0.05 0.03 0.03 0.03 0.04 0.11 0.11 0.04 0.06 0.07 0.05 0.05 0.07 0.02 0.08 0.04 0.03 0.06 0.11 0.36 0.13 0.26 0.13 0.09 0.02 0.04 0.05 0.25 0.22 0.08 0.13 0.04 0.04 0.07 1.00 0.00 0.00 0.04 0.05 0.03 0.15
PZEILPL 0.00 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.04 0.03 0.07 0.00 0.00 0.00 0.05 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.09 0.00 0.04 0.04 0.00 0.03 0.00 0.04 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.07 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.29 0.00 0.11 0.00 0.03
PPLEZIER 0.02 0.00 0.00 0.02 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.04 0.00 0.00 0.04 0.03 0.03 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.02 0.00 0.00 0.01 0.07 0.00 0.00 0.00 0.08 0.00 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.29 1.00 0.00 0.05 0.00 0.11
PFIETS 0.05 0.03 0.02 0.00 0.02 0.04 0.04 0.02 0.06 0.00 0.02 0.03 0.06 0.04 0.04 0.05 0.02 0.04 0.02 0.04 0.06 0.04 0.04 0.00 0.01 0.00 0.03 0.03 0.05 0.00 0.04 0.02 0.02 0.03 0.02 0.02 0.00 0.00 0.04 0.04 0.00 0.00 0.00 0.03 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.04 0.00 0.00 1.00 0.00 0.01 0.02
PINBOED 0.03 0.02 0.00 0.00 0.00 0.00 0.03 0.00 0.02 0.00 0.00 0.02 0.03 0.00 0.03 0.04 0.00 0.00 0.00 0.00 0.00 0.05 0.03 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.06 0.06 0.00 0.00 0.04 0.00 0.00 0.00 0.02 0.00 0.03 0.00 0.09 0.00 0.00 0.00 0.04 0.00 0.00 0.00 0.00 0.00 0.01 0.00 0.05 0.11 0.05 0.00 1.00 0.01 0.00
PBYSTAND 0.04 0.05 0.00 0.02 0.00 0.01 0.02 0.00 0.00 0.02 0.03 0.01 0.03 0.03 0.02 0.00 0.03 0.00 0.02 0.00 0.02 0.04 0.00 0.00 0.00 0.00 0.03 0.03 0.01 0.00 0.03 0.03 0.03 0.01 0.00 0.00 0.10 0.00 0.02 0.04 0.02 0.00 0.00 0.06 0.00 0.02 0.00 0.00 0.01 0.00 0.00 0.04 0.00 0.11 0.00 0.03 0.00 0.00 0.01 0.01 1.00 0.07
CARAVAN 0.12 0.12 0.00 0.05 0.03 0.04 0.06 0.01 0.06 0.04 0.00 0.02 0.08 0.04 0.09 0.07 0.00 0.04 0.05 0.06 0.05 0.09 0.05 0.00 0.07 0.06 0.08 0.08 0.07 0.00 0.08 0.05 0.05 0.09 0.00 0.07 0.06 0.00 0.11 0.11 0.10 0.00 0.00 0.18 0.00 0.06 0.00 0.00 0.00 0.00 0.04 0.02 0.00 0.05 0.03 0.15 0.03 0.11 0.02 0.00 0.07 1.00
In [37]:
#Heatmap of correlation matrix 
mask = np.zeros_like(df_vcramer, dtype = np.bool)
mask[np.triu_indices_from(mask)] = True

with sns.axes_style("white"):
    fig, axx = plt.subplots(figsize=(15,15))
    ax = sns.heatmap(df_vcramer, mask = mask, vmin = 0, vmax = 1, square = True, ax = axx)
    
plt.show()

Study of the histograms of the variables VS target variable ('CARAVAN')¶

In [38]:
def plot_feature(df, col_name, isContinuous, target):
    """
    Visualize a variable with and without faceting on the loan status.
    - df dataframe
    - col_name is the variable name in the dataframe
    - full_name is the full variable name
    - continuous is True if the variable is continuous, False otherwise
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
    
    count_null = df[col_name].isnull().sum()
    if isContinuous:
        
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name+ ' Number of nulls: '+str(count_null))
    plt.xticks(rotation = 90)


    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by '+target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index() 
        data.columns = [i, target, 'proportion']
        #sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target+' fraction')
        ax2.set_title(target)
        plt.xticks(rotation = 90)
    ax2.set_xlabel(col_name)
    
    plt.tight_layout()
In [39]:
for i in list(df_ticdata.columns):
    if (df_ticdata[i].dtype==float) & (i!='CARAVAN'):
        plot_feature(df_ticdata, col_name=i, isContinuous=True, target='CARAVAN')
    elif  i!='CARAVAN':
        plot_feature(df_ticdata, col_name=i, isContinuous=False, target='CARAVAN')

Conclusion:¶

The most important detail of this case, there are some variables which represented the same information, with the peculiarity some are categoricals and other are numericals.